2  Statistical Learning

Author

phonchi

Published

September 12, 2022



2.1 Basic Commands

Refer to the NumPy notebook for more details.

# imports and setup
import math
import pandas as pd
import numpy as np
from scipy.stats import pearsonr  # scipy.stats.stats is a deprecated alias
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # for 3D plots
import seaborn as sns
%matplotlib inline

pd.set_option('display.precision', 2) # number precision for pandas display
plt.style.use('seaborn-white') # renamed to 'seaborn-v0_8-white' in Matplotlib >= 3.6
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
# vector
x = [1,3,2,5] # do not use this
x = np.array([1, 6, 2]) # use ndarray instead
y = np.array([1, 4, 3])
x.shape, y.shape
((3,), (3,))
# array operations
x + y
array([ 2, 10,  5])
# matrix creation
x = np.array([[1,2],[3,4]]) # row major, do not use np.matrix
x, np.isfortran(x)
(array([[1, 2],
        [3, 4]]), False)
x = np.asfortranarray(np.array(range(1,5)).reshape(2,2)) # column-major (Fortran) order; avoid this in NumPy code
x, np.isfortran(x) 
(array([[1, 2],
        [3, 4]]), True)
np.array(range(1,5)).reshape(2,2)
array([[1, 2],
       [3, 4]])
np.sqrt(x) # c.f. https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.sqrtm.html
array([[1.        , 1.41421356],
       [1.73205081, 2.        ]])
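
As the linked documentation suggests, np.sqrt is elementwise; the matrix square root is a different operation. A quick contrast using scipy.linalg.sqrtm:

# matrix square root: S @ S reproduces x (up to numerical error)
from scipy.linalg import sqrtm
S = sqrtm(x)
np.allclose(S @ S, x)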
print(np.power(x, 2))
print(x**2)
[[ 1  4]
 [ 9 16]]
[[ 1  4]
 [ 9 16]]
# random normal distribution & correlation
x = np.random.randn(50)
y = x + np.random.normal(loc=50, scale=.1, size=50)
pearsonr(x, y)[0]
0.9961062617773434
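
Equivalently, NumPy's corrcoef returns the full 2 x 2 correlation matrix, whose off-diagonal entry is the same Pearson correlation:

# equivalent computation with NumPy
np.corrcoef(x, y)[0, 1]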
# random seed and basic statistical functions
np.random.seed(3)
y = np.random.randn(50)
y.mean(), y.var(), np.sqrt(y.var()), y.std()
(-0.2939268200794072,
 0.9969329133079725,
 0.9984652789696657,
 0.9984652789696657)
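
Note that NumPy's var() and std() divide by n by default (ddof=0), whereas R's var() and sd() divide by n - 1; pass ddof=1 to match R:

# unbiased (sample) variance and standard deviation, as R computes them
y.var(ddof=1), y.std(ddof=1)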

2.2 Graphics

Here, we review some of the commands we have learned, using Seaborn.

x = np.random.randn(100)
y = np.random.randn(100)

# seaborn scatterplot
p = sns.jointplot(x=x, y=y, kind='scatter')
p.set_axis_labels(xlabel='this is x axis', ylabel='this is y axis')

We will often want to save the output of a Python plot. The command we use to do this is savefig(); the output format is determined by the file extension.

p.savefig("Figure.pdf") #depends on the file extension
p.savefig("Figure.jpg")
# create a sequence of numbers
x = np.arange(1, 11)
x
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
# linearly spaced numbers
x = np.linspace(-np.pi, np.pi, num=50)
x
array([-3.14159265, -3.01336438, -2.88513611, -2.75690784, -2.62867957,
       -2.5004513 , -2.37222302, -2.24399475, -2.11576648, -1.98753821,
       -1.85930994, -1.73108167, -1.60285339, -1.47462512, -1.34639685,
       -1.21816858, -1.08994031, -0.96171204, -0.83348377, -0.70525549,
       -0.57702722, -0.44879895, -0.32057068, -0.19234241, -0.06411414,
        0.06411414,  0.19234241,  0.32057068,  0.44879895,  0.57702722,
        0.70525549,  0.83348377,  0.96171204,  1.08994031,  1.21816858,
        1.34639685,  1.47462512,  1.60285339,  1.73108167,  1.85930994,
        1.98753821,  2.11576648,  2.24399475,  2.37222302,  2.5004513 ,
        2.62867957,  2.75690784,  2.88513611,  3.01336438,  3.14159265])

We will now create some more sophisticated plots. We create three arrays here:

  • A vector of the x values (the first dimension),
  • A vector of the y values (the second dimension), and
  • A 2D array whose elements correspond to the z value (the third dimension) for each pair of (x, y) coordinates.
plt.figure(figsize=(6,6))

x = np.linspace(-np.pi, np.pi, num=50)
y = x

# simulating R outer function
def pf(a, b):
    return math.cos(b) / (1 + a**2)

f = np.empty((len(x), len(y)))
 
for i in range(len(x)):
    for j in range(len(y)):
        f[i,j] = pf(x[i], y[j])

        
# contour plot
cp = plt.contour(x, y, f.T, 45, cmap='viridis')
plt.clabel(cp, inline=1, fontsize=10);
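
The double loop above mimics R's outer(); the same array can also be built in one vectorized step, since f[i, j] = cos(y[j]) / (1 + x[i]**2) factors into an outer product:

# vectorized equivalent of the double loop
f_vec = np.outer(1/(1 + x**2), np.cos(y))
np.allclose(f, f_vec)   # should be True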

# contour 2
plt.figure(figsize=(6,6))

fa = (f - f.T)/2
cp = plt.contour(x, y, fa.T, 15, cmap='viridis')
plt.clabel(cp, inline=1, fontsize=10);

# heatmap
plt.figure(figsize=(6,6))
cp = plt.contourf(x, y, fa.T, 15, cmap='viridis') # contourf (note the additional "f") fills the contours
plt.clabel(cp, inline=1, fontsize=10)
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0x7f7c932f7a50>

# 3d perspective

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
X, Y = np.meshgrid(x, y)  # plot_wireframe expects 2D coordinate grids
ax.plot_wireframe(X, Y, fa.T)
<mpl_toolkits.mplot3d.art3d.Line3DCollection at 0x7f7c932129d0>
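
A filled surface is another way to view the same array; a minimal sketch using plot_surface with the same 2D grids:

# 3d surface of the same data
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
X, Y = np.meshgrid(x, y)
ax.plot_surface(X, Y, fa.T, cmap='viridis');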

2.3 Indexing Data

# matrix creation (R equivalent of matrix(1:16, 4 ,4))
A = np.arange(16).reshape(4, 4).transpose()
A
array([[ 0,  4,  8, 12],
       [ 1,  5,  9, 13],
       [ 2,  6, 10, 14],
       [ 3,  7, 11, 15]])
A[1, 2]
9
# select a range of rows and columns
A[0:3, 1:4]
array([[ 4,  8, 12],
       [ 5,  9, 13],
       [ 6, 10, 14]])
# select a range of rows and all columns
A[0:2,:]
array([[ 0,  4,  8, 12],
       [ 1,  5,  9, 13]])
# select all rows and a range of columns
A[:,0:2]
array([[0, 4],
       [1, 5],
       [2, 6],
       [3, 7]])
A[0,:]
array([ 0,  4,  8, 12])
# shape of the matrix
A.shape
(4, 4)
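
NumPy has no direct analogue of R's negative indexing (e.g. A[-c(1,3), ]) for keeping all but selected rows; a boolean mask or np.delete gives the same result. A small sketch:

# drop the first and third rows, keeping everything else
mask = np.ones(A.shape[0], dtype=bool)
mask[[0, 2]] = False
A[mask, :]
# equivalently
np.delete(A, [0, 2], axis=0)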

2.4 Loading Data

The Auto data set includes a number of missing observations, indicated by a question mark (?). Missing values are a common occurrence in real data sets.

from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# read csv data with pandas into dataframe, explicitly setting na_values.
# pandas read_xxx functions infer datatypes, headers, dates, etc. 
# without explicit declarations
Auto = pd.read_csv('/content/drive/MyDrive/Lab/Data/Auto_origin.csv', na_values=['?'])
Auto
mpg cylinders displacement horsepower weight acceleration year origin name
0 18.0 8 307.0 130.0 3504 12.0 70 1 chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693 11.5 70 1 buick skylark 320
2 18.0 8 318.0 150.0 3436 11.0 70 1 plymouth satellite
3 16.0 8 304.0 150.0 3433 12.0 70 1 amc rebel sst
4 17.0 8 302.0 140.0 3449 10.5 70 1 ford torino
... ... ... ... ... ... ... ... ... ...
392 27.0 4 140.0 86.0 2790 15.6 82 1 ford mustang gl
393 44.0 4 97.0 52.0 2130 24.6 82 2 vw pickup
394 32.0 4 135.0 84.0 2295 11.6 82 1 dodge rampage
395 28.0 4 120.0 79.0 2625 18.6 82 1 ford ranger
396 31.0 4 119.0 82.0 2720 19.4 82 1 chevy s-10

397 rows × 9 columns
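
Before dropping anything, it is useful to see where the missing values are; for this data set they should all be in the horsepower column.

# count missing values in each column
Auto.isna().sum()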

# dropping rows (axis-0) where there are NA values (inplace)
Auto.dropna(axis=0, inplace=True)
Auto.shape
(392, 9)
Auto.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   year          392 non-null    int64  
 7   origin        392 non-null    int64  
 8   name          392 non-null    object 
dtypes: float64(4), int64(4), object(1)
memory usage: 30.6+ KB
Auto.describe()
mpg cylinders displacement horsepower weight acceleration year origin
count 392.00 392.00 392.00 392.00 392.00 392.00 392.00 392.00
mean 23.45 5.47 194.41 104.47 2977.58 15.54 75.98 1.58
std 7.81 1.71 104.64 38.49 849.40 2.76 3.68 0.81
min 9.00 3.00 68.00 46.00 1613.00 8.00 70.00 1.00
25% 17.00 4.00 105.00 75.00 2225.25 13.78 73.00 1.00
50% 22.75 4.00 151.00 93.50 2803.50 15.50 76.00 1.00
75% 29.00 8.00 275.75 126.00 3614.75 17.02 79.00 2.00
max 46.60 8.00 455.00 230.00 5140.00 24.80 82.00 3.00
# get column names of the dataframe
Auto.columns
Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin', 'name'],
      dtype='object')
# seaborn scatterplot
pl = sns.jointplot(x='cylinders', y='mpg', data=Auto)

# changing data type of a column into category
Auto['cylinders'] = Auto['cylinders'].astype('category')
Auto.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           392 non-null    float64 
 1   cylinders     392 non-null    category
 2   displacement  392 non-null    float64 
 3   horsepower    392 non-null    float64 
 4   weight        392 non-null    int64   
 5   acceleration  392 non-null    float64 
 6   year          392 non-null    int64   
 7   origin        392 non-null    int64   
 8   name          392 non-null    object  
dtypes: category(1), float64(4), int64(3), object(1)
memory usage: 36.3+ KB
# seaborn boxplot 
sns.boxplot(x='cylinders', y='mpg', data=Auto)
<matplotlib.axes._subplots.AxesSubplot at 0x7f7c933c7d90>

# seaborn displot
sns.displot(x="mpg", data=Auto, bins=15)

# seaborn pairplot for selected variables, colored by another
sns.pairplot(Auto, vars=['mpg', 'displacement', 'horsepower', 'weight', 'acceleration'], hue='cylinders')

# summary statistics for all dataframe columns, including non-numerical columns
Auto.describe(include='all')
mpg cylinders displacement horsepower weight acceleration year origin name
count 392.00 392.0 392.00 392.00 392.00 392.00 392.00 392.00 392
unique NaN 5.0 NaN NaN NaN NaN NaN NaN 301
top NaN 4.0 NaN NaN NaN NaN NaN NaN amc matador
freq NaN 199.0 NaN NaN NaN NaN NaN NaN 5
mean 23.45 NaN 194.41 104.47 2977.58 15.54 75.98 1.58 NaN
std 7.81 NaN 104.64 38.49 849.40 2.76 3.68 0.81 NaN
min 9.00 NaN 68.00 46.00 1613.00 8.00 70.00 1.00 NaN
25% 17.00 NaN 105.00 75.00 2225.25 13.78 73.00 1.00 NaN
50% 22.75 NaN 151.00 93.50 2803.50 15.50 76.00 1.00 NaN
75% 29.00 NaN 275.75 126.00 3614.75 17.02 79.00 2.00 NaN
max 46.60 NaN 455.00 230.00 5140.00 24.80 82.00 3.00 NaN
# summary statistics for a single column and wrapped as dataframe for pretty table display in jupyter
pd.DataFrame(Auto['mpg'].describe())
mpg
count 392.00
mean 23.45
std 7.81
min 9.00
25% 17.00
50% 22.75
75% 29.00
max 46.60

2.5 KNN Example

The following code is modified from https://github.com/empathy87/The-Elements-of-Statistical-Learning-Python-Notebooks

  1. Generates 10 means \(m_k\) from a bivariate Gaussian distribution for each color:
    • \(N((1, 0)^T, \textbf{I})\) for BLUE
    • \(N((0, 1)^T, \textbf{I})\) for ORANGE
  2. For each color, generates 100 observations as follows:
    • For each observation, it picks one of the \(m_k\) at random with probability 1/10.
    • It then generates the observation from \(N(m_k, \textbf{I}/5)\); an illustrative simulation of this process is sketched right after this list, while the code below loads the exact data used in the book.
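
A minimal illustrative sketch of the recipe above (fresh random draws, so it will not reproduce the book's data):

# illustrative simulation of the mixture scenario described above
rng = np.random.default_rng(0)
def simulate_class(center, n_obs=100, n_means=10):
    means = rng.multivariate_normal(center, np.eye(2), size=n_means)   # the 10 m_k
    picks = rng.integers(n_means, size=n_obs)                          # choose an m_k with prob 1/10
    return means[picks] + rng.multivariate_normal([0, 0], np.eye(2)/5, size=n_obs)

X_blue = simulate_class([1, 0])      # BLUE class
X_orange = simulate_class([0, 1])    # ORANGE class
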
from sklearn.mixture import GaussianMixture
from sklearn.mixture._gaussian_mixture import _compute_precision_cholesky
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# load training data that was used in the book
df = pd.read_csv("/content/drive/MyDrive/Lab/Data/mixture.txt")
X_train = df[['x1', 'x2']].values
y_train = df.y.values

# set the known BLUE and ORANGE clusters means
blue_means = np.array([[-0.25343316, 1.7414788], [0.26669318, 0.3712341],
                       [2.09646921, 1.2333642], [-0.06127272, -0.2086791],
                       [2.70354085, 0.5968283], [2.37721198, -1.1864147],
                       [1.05690759, -0.6838939], [0.57888354, -0.0683458],
                       [0.62425213, 0.5987384], [1.67335495, -0.2893159]])
orange_means = np.array([[1.19936869, 0.2484086], [-0.30256110, 0.9454190],
                         [0.05727232, 2.4197271], [1.32932203, 0.8192260],
                         [-0.07938424, 1.6138017], [3.50792673, 1.0529863],
                         [1.61392290, 0.6717378], [1.00753570, 1.3683071],
                         [-0.45462141, 1.0860697], [-1.79801805, 1.9297806]])
all_means = np.vstack((blue_means, orange_means))

gaussian_mixture_model = GaussianMixture(
    n_components=20,
    covariance_type='spherical',
    means_init=all_means,
    random_state=1
).fit(all_means)
# set known covariances
gaussian_mixture_model.covariances_ = [1/5]*20
# it looks like a hack, but GaussianMixture uses precisions_cholesky_
# for predict_proba method. Because we changed covariances_ we need
# to recalculate precisions_cholesky_ too.
gaussian_mixture_model.precisions_cholesky_ = _compute_precision_cholesky(
    gaussian_mixture_model.covariances_,
    gaussian_mixture_model.covariance_type)

# sample 10,000 points for testing
X_test, y_test = gaussian_mixture_model.sample(10000)
# y_test contains sampled component indices
# index < 10 means that the class is BLUE (0)
y_test = 1*(y_test >= 10)
def optimal_bayes_predict(X):
    components_proba = gaussian_mixture_model.predict_proba(X)
    # the first 10 components are BLUE (0), the rest are ORANGE (1)
    blue_proba = np.sum(components_proba[:, :10], axis=1)
    orange_proba = np.sum(components_proba[:, 10:], axis=1)
    y_hat = 1*(blue_proba < orange_proba)
    return y_hat
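
Since the generating mixture is fully known, the Bayes-optimal classifier simply assigns the class with the larger posterior probability. With equal class priors and equal component weights this amounts to comparing the summed component posteriors returned by predict_proba, which is exactly what optimal_bayes_predict does:

\[ \hat{G}(x) = \text{ORANGE} \iff \sum_{k=11}^{20} \Pr(k \mid x) > \sum_{k=1}^{10} \Pr(k \mid x) \]
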
bayes_error_rate = 1 - accuracy_score(y_train, optimal_bayes_predict(X_train))
print(f'The optimal Bayes error rate = {bayes_error_rate}')
The optimal Bayes error rate = 0.15000000000000002
bayes_error_rate = 1 - accuracy_score(y_test, optimal_bayes_predict(X_test))
print(f'The optimal Bayes error rate = {bayes_error_rate}')
The optimal Bayes error rate = 0.2148
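
The first estimate is based on only the 200 training points and is therefore quite noisy; the 10,000 freshly simulated test points give a more reliable estimate of the true Bayes error rate for this mixture (roughly 0.21).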
# define commonly used colors
GRAY1, GRAY4, PURPLE = '#231F20', '#646369', '#A020F0'
BLUE, ORANGE, BLUE1 = '#57B5E8', '#E69E00', '#174A7E'

# configure all plots font family and border line widths
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.linewidth'] = 0.5

# prepares a plot with a title and circles representing training data
def plot_train_data(title):
    fig, ax = plt.subplots(figsize=(2.8, 2.8), dpi=110)
    ax.set_aspect(1.3)
    ax.scatter(X_train[:, 0], X_train[:, 1], s=18, facecolors='none',
               edgecolors=np.array([BLUE, ORANGE])[y_train])
    ax.tick_params(
        bottom=False, left=False, labelleft=False, labelbottom=False)
    ax.set_xlim(-2.6, 4.2)
    ax.set_ylim(-2.0, 2.9)
    fig.subplots_adjust(left=0, right=1, top=1, bottom=0)
    ax.text(-2.6, 3.2, title, color=GRAY4, fontsize=9)
    for spine in ax.spines.values():
        spine.set_color(GRAY1)
    return fig, ax

# test it
_, _ = plot_train_data('Training data')
WARNING:matplotlib.font_manager:findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.

# given a model prediction function, computes X points on an n1 x n2 grid and
# the corresponding predicted classes
def fill_prediction_grid(n1, n2, predict):
    x1, x2 = np.linspace(-2.6, 4.2, n1), np.linspace(-2.0, 2.9, n2)
    X = np.transpose([np.tile(x1, n2), np.repeat(x2, n1)])
    y = predict(X)
    return X, y


# given a model prediction function, computes X0 and X1 n1 x n2 meshgrids
# and the corresponding predicted-classes meshgrid
def fill_prediction_meshgrid(predict):
    n1, n2 = 1000, 1000
    X, y = fill_prediction_grid(n1, n2, predict)
    return X[:, 0].reshape(n1, n2), X[:, 1].reshape(n1, n2), y.reshape(n1, n2)


# given a model prediction function, plots training data, the model decision
# boundary, and background dots
def plot_model(predict, title):
    fig, ax = plot_train_data(title)
    # plot background dots
    X, y = fill_prediction_grid(69, 99, predict)
    ax.scatter(X[:, 0], X[:, 1], marker='.', lw=0, s=2,
               c=np.array([BLUE, ORANGE])[y])
    # plot the decision boundary
    X0, X1, Y = fill_prediction_meshgrid(predict)
    ax.contour(X0, X1, Y, [0.5], colors=GRAY1, linewidths=[0.7])
    return fig, ax

# plot the optimal Bayes decision boundary
_, _ = plot_model(optimal_bayes_predict, 'Bayes Optimal Classifier')

# let's save the Bayes meshgrids for plotting the optimal decision boundary
X0_bayes, X1_bayes, Y_bayes = fill_prediction_meshgrid(optimal_bayes_predict)


# given a model prediction function plots performance statistics
def plot_model_stat(predict, title):
    fig, ax = plot_model(predict, title)
    ax.contour(X0_bayes, X1_bayes, Y_bayes, [0.5], colors='purple',
               linewidths=[0.5], linestyles='dashed')
    test_error_rate = 1 - accuracy_score(y_test, predict(X_test))
    train_error_rate = 1 - accuracy_score(y_train, predict(X_train))
    parms = {'color': GRAY1, 'fontsize': 7,
             'bbox': {'facecolor': 'white', 'pad': 3, 'edgecolor': 'none'}}
    ax.text(-2.42, -1.35, f'Training Error: {train_error_rate:.3f}', **parms)
    ax.text(-2.42, -1.62, f'Test Error:       {test_error_rate:.3f}', **parms)
    ax.text(-2.42, -1.89, f'Bayes Error:    {bayes_error_rate:.3f}', **parms)
    return fig, ax
# Run GridSearchCV to find the best n_neighbors parameter using 10-fold
# CV. It finds 12, but the book uses a 15-nearest-neighbor classifier because
# the authors selected the most parsimonious model within one standard error
# of the best model (the one-standard-error rule); a sketch of that rule
# follows after the search. We will apply this rule in other examples, not here.
k_neighbors_grid_search = GridSearchCV(
    KNeighborsClassifier(),
    {'n_neighbors': list(range(1, 50))},
    cv=10
).fit(X_train, y_train)
k_neighbors_grid_search.best_params_
{'n_neighbors': 12}
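
As an aside, the one-standard-error rule mentioned in the comment can be computed directly from cv_results_. A minimal sketch (not used in the rest of this lab), approximating the standard error of each mean CV score from the fold-level scores:

# one-standard-error rule: among models within one SE of the best CV score,
# pick the most parsimonious (for KNN, the largest number of neighbors)
res = k_neighbors_grid_search.cv_results_
mean_score = res['mean_test_score']
se_score = res['std_test_score'] / np.sqrt(10)            # 10-fold CV
threshold = mean_score.max() - se_score[np.argmax(mean_score)]
ks = np.array(res['param_n_neighbors'], dtype=int)
print(ks[mean_score >= threshold].max())                  # candidate parsimonious k
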
# PAGE 14. Use 15-nearest-neighbor averaging of the binary coded response as
#          the method of fitting. Thus Y-hat is the proportion of ORANGE’s in
#          the neighborhood, and so assigning class ORANGE to G-hat if
#          Y-hat>0.5 amounts to a majority vote in the neighborhood.
neighbors15_classifier = KNeighborsClassifier(
    n_neighbors=15
).fit(X_train, y_train)
_, _ = plot_model_stat(
    neighbors15_classifier.predict, '15-Nearest Neighbor Classifier')
WARNING:matplotlib.font_manager:findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.

# PAGE 16. The classes are coded as a binary variable (BLUE = 0,ORANGE = 1),
#          and then predicted by 1-nearest-neighbor classification.
neighbors1_classifier = KNeighborsClassifier(
    n_neighbors=1
).fit(X_train, y_train)
_, _ = plot_model_stat(
    neighbors1_classifier.predict, '1−Nearest Neighbor Classifier')

n_neighbors_vals = list(range(1, 30, 2))
k_neighbors_grid_search = GridSearchCV(
    KNeighborsClassifier(),
    {'n_neighbors': n_neighbors_vals},
    cv=10, scoring='accuracy',
    return_train_score=True
).fit(X_train, y_train)

train_errors, test_errors = [], []
for k in n_neighbors_vals:
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_errors.append(1 - clf.score(X_train, y_train))
    test_errors.append(1 - clf.score(X_test, y_test))
# PAGE 467. k-nearest-neighbors on the two-class mixture data. The upper
#           panel shows the misclassification errors as a function of
#           neighborhood size. Standard error bars are included for 10-fold
#           cross validation. The lower panel shows the decision boundary
#           for 7-nearest-neighbors, which appears to be optimal for minimizing
#           test error.
cv_errors = 1 - np.vstack([
    k_neighbors_grid_search.cv_results_[f'split{i}_test_score']
    for i in range(10)]).T
cv_mean_errors = np.mean(cv_errors, axis=1)
cv_std_errors = np.std(cv_errors, ddof=1, axis=1)/np.sqrt(10)
best_index = np.argmin(cv_mean_errors)
best_err, best_std_err = cv_mean_errors[best_index], cv_std_errors[best_index]

fig, ax = plt.subplots(figsize=(2.8, 2.8), dpi=110)
fig.subplots_adjust(left=0, right=1, top=1, bottom=0)
ax.scatter(n_neighbors_vals, test_errors, c='#0000FF', s=9)
ax.plot(n_neighbors_vals, test_errors, c='#0000FF', linewidth=0.8,
        label='Test Error')
ax.plot(n_neighbors_vals, cv_mean_errors, c='#00FF00', linewidth=0.8,
        label='10-fold CV')
ax.scatter(n_neighbors_vals, train_errors, c=ORANGE, s=9)
ax.plot(n_neighbors_vals, train_errors, c=ORANGE, linewidth=0.8,
        label='Training Error')

ax.errorbar(n_neighbors_vals, cv_mean_errors,
            color='#00FF00', linestyle='None', marker='o', elinewidth=0.8,
            markersize=3, yerr=cv_std_errors, ecolor='#00FF00', capsize=3)
ax.axhline(y=best_err+best_std_err, c=PURPLE, linewidth=0.8, linestyle='--',
           label='Bayes Error')
for i in ax.get_yticklabels() + ax.get_xticklabels():
    i.set_fontsize(6)
ax.set_xlabel('Number of Neighbors', color=GRAY4, fontsize=8)
ax.set_ylabel('Misclassification Errors', color=GRAY4, fontsize=8)

neighbors7_classifier = KNeighborsClassifier(
    n_neighbors=7).fit(X_train, y_train)
plot_model_stat(neighbors7_classifier.predict, '7-Nearest Neighbors')
_ = ax.legend(loc='best', prop={'size': 8})
WARNING:matplotlib.font_manager:findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
WARNING:matplotlib.font_manager:findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.

2.6 References

[1] https://web.stanford.edu/~hastie/ElemStatLearn/

[2] https://github.com/emredjan/ISL-python

[3] https://github.com/empathy87/The-Elements-of-Statistical-Learning-Python-Notebooks